************************************************************
************************************************************
**														  **
**	Tanzania, 2008-09									  **
**										   				  **
************************************************************
************************************************************

clear
version 14
set more off
cap log close

gl IN 	"${DATADRIVE}\LSMS-ISA\2008-09"
gl OUT 	"${LOCDRIVE}\Results"
gl TEMP	"${LOCDRIVE}\Tempdata"

loc hh hhid
loc ea ea
loc weight hh_weight
loc strata strataid
loc admin0 macroregion
loc admin1 region 		
loc admin2 district  	
loc admin3 ward
loc indiv sbmemno

set matsize 10000

loc admin0_base = $admin0_base
loc admin1_base = $admin1_base
loc admin2_base = $admin2_base
loc admin3_base = $admin3_base_r1

loc gooddisagglist $gooddisagglist 
loc servicedisagglist $servicedisagglist

log using "${TEMP}\DataPrep_Tanzania_R1", replace


*****************************
**** Create Sample Files ****
*****************************

use  "${IN}\SEC_A_T.dta", clear
sort `hh'
merge 1:1 `hh' using "${IN}\TZY1.HH.Consumption.dta", nogen keep(1 3) keepusing(urban)

recode urban (2=1) (1=0) /*was coded as 1 rural 2 urban*/
label values urban .

keep `hh' `ea' `weight' `strata' `admin1' `admin2' `admin3' urban
sort `hh'

gen macroregion=1 if region==1 | region==13		 /*central*/
replace macroregion=2 if region==2 | region==3 | region==4 | region==21	/* northern*/
replace macroregion=3 if region==5 | region==6 | region==7		/*eastern*/
replace macroregion=4 if region==8 | region==9 | region==10	 /*southern*/
replace macroregion=5 if region==11 | region==12 | region==15	/*southern highlands*/
replace macroregion=6 if region==14 | region==16 | region==17	/*western*/
replace macroregion=7 if region==18 | region==19 | region==20	/*lake*/
replace macroregion=8 if region>=51 & region<=55	/*zanzibar*/

tab macroregion,m
label define macroregion 1 "Central" 2 "Northern" 3 "Eastern" 4 "Southern" 5 "Southern Highlands" 6 "Western" 7 "Lakes" 8 "Zanzibar"
label values macroregion macroregion 

tempfile sample
save `sample', replace

g count = 1
collapse (sum) count, by(`ea' `admin0' `admin1' `admin2' `admin3' urban)
drop count
sort `admin0' `admin1' `admin2' `admin3' `ea'
egen ea_c = group(`admin0' `admin1' `admin2' `admin3' `ea')
tempfile sample_ea
save `sample_ea', replace

g count = 1
collapse (sum) count, by(`admin0' `admin1' `admin2' `admin3')
drop count
sort `admin0' `admin1' `admin2' `admin3'
egen admin3_c = group(`admin0' `admin1' `admin2' `admin3')
su admin3_c if `admin0' == `admin0_base' & `admin1' == `admin1_base' & `admin2' == `admin2_base' & `admin3'==`admin3_base', detail
loc admin3_cbase = r(p50)
display "`admin3_cbase'"
tempfile sample_admin3
save `sample_admin3', replace

g count = 1
collapse (sum) count, by(`admin0' `admin1' `admin2')
drop count
sort `admin0' `admin1' `admin2' 
egen admin2_c = group(`admin0' `admin1' `admin2')
su admin2_c if `admin0' == `admin0_base' & `admin1' == `admin1_base' & `admin2' == `admin2_base', detail
loc admin2_cbase = r(p50)
display "`admin2_cbase'"
tempfile sample_admin2
save `sample_admin2', replace

g count = 1
collapse (sum) count, by(`admin0' `admin1')
drop count
sort `admin0' `admin1' 
egen admin1_c = group(`admin0' `admin1')
su admin1_c if `admin0' == `admin0_base' & `admin1' == `admin1_base', detail
loc admin1_cbase = r(p50)
display "`admin1_cbase'"
tempfile sample_admin1
save `sample_admin1', replace

g count = 1
collapse (sum) count, by(`admin0')
drop count
sort `admin0' 
egen admin0_c = group(`admin0')
su admin0_c if `admin0' == `admin0_base', detail
loc admin0_cbase = r(p50)
display "`admin0_cbase'"
tempfile sample_admin0
save `sample_admin0', replace



** Household sample file

use `sample', clear
merge m:1 `admin0' `admin1' `admin2' `admin3' `ea' using `sample_ea', nogen keep(1 3) keepusing(ea_c)
merge m:1 `admin0' `admin1' `admin2' `admin3' using `sample_admin3', nogen keep(1 3) keepusing(admin3_c)
merge m:1 `admin0' `admin1' `admin2' using `sample_admin2', nogen keep(1 3) keepusing(admin2_c)
merge m:1 `admin0' `admin1' using `sample_admin1', nogen keep(1 3) keepusing(admin1_c)
merge m:1 `admin0' using `sample_admin0', nogen keep(1 3) keepusing(admin0_c)

save "${TEMP}/Sample_links_R1.dta", replace 


****************************
**** Expenditures FOOD  ****
****************************

** First generate month_year from the hh interview date

use  "${IN}\SEC_A_T.dta", clear

rename sa2q18y year_int
rename sa2q18m month_int

keep `hh' year_int month_int

sort year_int month_int
*egen month_year = group(year_int month_int)

tempfile intdate
save `intdate', replace


use "${IN}\SEC_K1.dta", clear
label list SKQ1
tab skq1,m
recode skq1 2=0 

label list SKQ2_MEAS
/*SKQ2_MEAS: 0 NA 1 KILOGRAMS 2 GRAMS 3 LITRE 4 MILLILITRE 5 PIECES*/

rename 	(skcode 	skq1 	skq2_meas 	skq2_amount 	skq3_meas 		skq3_amount 	skq4 	skq5_meas 	skq5_amount 	skq6_meas 	skq6_amount) ///
		(itemcode 	eat 	unit_total 	q_total 		unit_purchase 	q_purchase 		spend 	unit_prod 	q_prod 			unit_gift 	q_gift)
 
duplicates report `hh' itemcode 
foreach i in total purchase prod gift {
   recode unit_`i' 0=.
   }
   
*data consistency check
gen ck0=(q_total==0 | q_total==.) if eat==1  //3 obs found (2 have missing q info for any source, 1 has missing q_total but non-missing q_gift)
tab ck0
replace eat=0 if ck0==1 & q_total==0 //assume these are error
foreach i in total purchase prod gift {
   replace unit_`i'=. if ck0==1 & q_total==0
   }
replace q_total= q_gift if (q_gift!=.) & (q_gift > 0) & q_total == .  //one obs 
gen ck_=(q_total!=.) if eat==0
tab ck_  //3 out of 5 cases incorrectly recorded no consumption when food was consumed.
replace eat=1 if hhid=="11020280050603" & itemcode==808
replace eat=1 if hhid=="08030100020133" & itemcode==807
replace eat=1 if hhid=="07010090870011" & itemcode==703

foreach i in purchase prod gift { 
	gen ck_0`i'=(unit_`i'==. & q_`i' !=.) //unit is missing but q is not missing
	replace ck_0`i'=0 if unit_`i'==. & q_`i' ==0 //unit is missing but q is not 0
	gen ck_1`i'=(unit_`i' !=. & q_`i'==.) //q is missing but unit is not missing
	  
	tab ck_0`i',m 
	tab ck_1`i',m
	}
	
*fix the errors
replace unit_purchase=unit_total if ck_0purchase==1 //1 change
replace unit_prod=unit_total if ck_0prod==1 //1 change
replace unit_gift=unit_total if ck_0gift==1 //1 change
replace unit_prod=. if ck_1prod==1 //1 change

gen ck_2=0
replace ck_2=1 if spend==0 & q_purchase>0 
tab ck_2 //74 obs found: In this first wave, many errors found compared to the other waves. 
*We assume that 74 obs were actually not purchased, but from production or gift. Treat them as gift unless not otherwise accounted for in own-production (in for 4 cases it is).

replace unit_gift=unit_total if ck_2==1 
replace q_gift=q_total if ck_2==1 & q_prod==0
	replace q_gift=q_total - q_prod if ck_2==1 & q_prod>0
replace q_purchase=. if ck_2==1
replace unit_purchase=. if ck_2==1
replace spend=. if ck_2==1

*replace missing quantity with 0
foreach i in total purchase prod gift {
	recode q_`i' (.=0)
	}
	
drop ck0 ck_*

*convert units (grams->kilograms/ millilitre->litre)
tab unit_total

foreach i in total purchase prod gift { 
	replace q_`i'=q_`i'/1000 if unit_`i'==2 | unit_`i'==4 
	replace unit_`i'=1 if unit_`i'==2
	replace unit_`i'=3 if unit_`i'==4
	}
	
tempfile sec_k1_mid
save `sec_k1_mid', replace


*check if units for all sources are equal within an item 
rename (unit_total q_total unit_purchase q_purchase unit_prod q_prod unit_gift q_gift) ///
(unit_1 q_1 unit_2 q_2 unit_3 q_3 unit_4 q_4)
reshape long unit_ q_,i(`hh' itemcode) j(source)
drop if q_==0
by `hh' itemcode, sort: egen sd_= sd(unit_)
g diff= sd_ != . & sd_ > 0 /*these have only one type of cons per hh-item*/
drop sd_
tab diff //8 cases of differing units within a hh-item. Most of them seem to be errors (e.g., in 4 cases there is only one type of consumption and teh same qty is reported as kg and liter)

*fix these errors
use `sec_k1_mid', clear
replace unit_purchase=unit_total if q_total==q_purchase & unit_total!=unit_purchase /*4 of the cases*/
replace unit_purchase=unit_total if hhid=="07030120480037" & itemcode==502 /*discrepancy in unit and qty, only purchased*/
replace q_purchase=4.55 if hhid=="07030120480037" & itemcode==502 
replace q_prod=1.8 if hhid=="16020210010671" & itemcode==602 /*prod unit most likely miscoded as pieces when g was intended*/
replace unit_prod=unit_total if hhid=="16020210010671" & itemcode==602
replace q_total=q_purchase if hhid=="53020080020016" & itemcode==902 /*total unit most likely miscoded as g when kg was intended*/
replace unit_total=unit_purchase if hhid=="53020080020016" & itemcode==902
replace unit_purchase=. if hhid=="19060040030038" & itemcode==702 /*most likely only consumed as gift and not purchased -- replaced*/
replace q_purchase=.  if hhid=="19060040030038" & itemcode==702
replace q_gift=q_total  if hhid=="19060040030038" & itemcode==702
replace unit_gift=unit_total if hhid=="19060040030038" & itemcode==702


** Purchase unit values (expenditures / quantity)

gen price_purchase=spend/q_purchase
g ch_=1 if price_purchase==. & q_purchase!=. & q_purchase>0 /*only 4 - in 3 cases it seems like they really produced rather than purchasing because prod total = total. In the other case it is not elsewhere reported, move it to own prod */
replace q_purchase=. if ch_==1 & q_prod==q_total
replace unit_purchase=. if ch_==1 & q_prod==q_total
drop ch_
g ch_=1 if price_purchase==. & q_purchase!=. & q_purchase>0
replace q_prod=q_purchase if ch_==1 
replace unit_prod=unit_purchase if ch_==1 
replace q_purchase=. if ch_==1
replace unit_purchase=. if ch_==1

sort hhid
merge m:1 hhid using `sample'
drop _merge 

tempfile sec_k1_mid
save `sec_k1_mid', replace


keep if q_total>0 & q_total != . 
tempfile food_consumed_consumption
save `food_consumed_consumption', replace


** Generate median prices per kg from price_purchase (only for consumers)

keep if q_purchase > 0 & q_purchase != . 
tempfile food_purchased_consumption
save `food_purchased_consumption', replace

*by urb-region-district-ward-ea all units
preserve
collapse (median) p_purchase_adm4_=price_purchase, by(itemcode unit_purchase urban `admin0' `admin1' `admin2' `admin3' ea)
keep if p_purchase_adm4_ != . //drop missing p drawn from unit_purchase=. 
sort itemcode unit_purchase urban `admin1' `admin2' `admin3' ea
reshape wide p_purchase_adm4_, i(itemcode urban `admin0' `admin1' `admin2' `admin3' ea) j(unit_purchase)
tempfile p_purchase_adm4 
save `p_purchase_adm4', replace
restore

*generate median unit price by urb-region-district-ward
preserve
collapse (median) p_purchase_adm3_=price_purchase, by(itemcode unit_purchase urban `admin0' `admin1' `admin2' `admin3')
keep if p_purchase_adm3_ != .
sort itemcode unit_purchase urban `admin0' `admin1' `admin2' `admin3'
reshape wide p_purchase_adm3_, i(itemcode urban `admin0' `admin1' `admin2' `admin3') j(unit_purchase)
tempfile p_purchase_adm3
save `p_purchase_adm3', replace
restore

*generate median unit price by urb-region-district
preserve
collapse (median) p_purchase_adm2_=price_purchase, by(itemcode unit_purchase urban `admin0' `admin1' `admin2')
keep if p_purchase_adm2_ != .
sort itemcode unit_purchase urban `admin0' `admin1' `admin2'
reshape wide p_purchase_adm2_, i(itemcode urban `admin0' `admin1' `admin2') j(unit_purchase)
tempfile p_purchase_adm2
save `p_purchase_adm2', replace
restore

*generate median unit price by urb-region
preserve
collapse (median) p_purchase_adm1_=price_purchase, by(itemcode unit_purchase urban `admin0' `admin1')
keep if p_purchase_adm1_ != .
duplicates drop
sort itemcode unit_purchase urban `admin0' `admin1' 
reshape wide p_purchase_adm1_, i(itemcode urban `admin0' `admin1') j(unit_purchase)
tempfile p_purchase_adm1
save `p_purchase_adm1', replace
restore

*generate median unit price by macroregion
preserve
collapse (median) p_purchase_adm0_=price_purchase, by(itemcode unit_purchase urban `admin0')
keep if p_purchase_adm0_ != .
duplicates drop
sort itemcode unit_purchase urban `admin0' 
reshape wide p_purchase_adm0_, i(itemcode urban `admin0') j(unit_purchase)
tempfile p_purchase_adm0
save `p_purchase_adm0', replace
restore

*generate median unit price by urban
preserve
collapse (median) p_purchase_urbrur_=price_purchase, by(itemcode unit_purchase urban)
keep if p_purchase_urbrur_ != .
duplicates drop
sort itemcode unit_purchase urban
reshape wide p_purchase_urbrur_, i(itemcode urban) j(unit_purchase)
tempfile p_purchase_urbrur
save `p_purchase_urbrur', replace
restore

*generate median unit price by unit
preserve
collapse (median) p_purchase_unit_=price_purchase, by(itemcode unit_purchase)
keep if p_purchase_unit_ != .
duplicates drop
sort itemcode unit_purchase
reshape wide p_purchase_unit_, i(itemcode) j(unit_purchase)
tempfile p_purchase_unit
save `p_purchase_unit', replace
restore


** Merge prices for different levels from price_purchase with consumption info **

*Notes: In this section only keep housheold who consumed that item
*       Information about non-consuming households is not relevant for unit conversion. 

use `food_consumed_consumption', clear
tab q_total if unit_total == . //0 obs 
rename unit_total unit

keep hhid itemcode unit ea `admin3' `admin2' `admin1' `admin0' urban  
tempfile price_sample
save `price_sample', replace

merge m:1 itemcode ea `admin3' `admin2' `admin1' `admin0' urban using `p_purchase_adm4'
drop _merge
merge m:1 itemcode `admin3' `admin2' `admin1' `admin0' urban using `p_purchase_adm3' 
drop _merge
merge m:1 itemcode `admin2' `admin1' `admin0' urban using `p_purchase_adm2' 
drop _merge
merge m:1 itemcode `admin1' `admin0' urban using `p_purchase_adm1'
drop _merge
merge m:1 itemcode `admin0' urban using `p_purchase_adm0'
drop _merge
merge m:1 itemcode urban using `p_purchase_urbrur'
drop _merge
merge m:1 itemcode using `p_purchase_unit'
drop _merge

foreach lev in adm4 adm3 adm2 adm1 adm0 urbrur unit {
	foreach u in 1 3 5 {
		winsor2 p_purchase_`lev'_`u' if unit==`u', suffix(imp) by(itemcode) cuts(1 99)
		g dif = p_purchase_`lev'_`u'!= p_purchase_`lev'_`u'imp & p_purchase_`lev'_`u'!=. & unit==`u'
		display "level: `lev'; unit: `u'; p_purchase_`lev'_`u'"
		bysort itemcode: su p_purchase_`lev'_`u' p_purchase_`lev'_`u'imp if dif==1
		replace p_purchase_`lev'_`u' = p_purchase_`lev'_`u'imp if dif==1
		drop dif
		}
	}

drop p_purchase*imp
	
tempfile prices
save `prices', replace


** Now fill in missing prices

use `sec_k1_mid', clear
rename unit_total unit
merge m:m hhid itemcode unit using `prices'
tab q_total if _merge==1 
tab q_total if _merge==2
drop if _merge==1  // 147863 observation with no total consumption; these do not matter for the unit conversion. 
drop _merge

* generate median level price 
foreach u in 1 3 5 {
	gen price_`u'_med = p_purchase_adm4_`u' 
	}

foreach u in 1 3 5 {
	* keep median price only if there are 3 or more households that consumed that item with the same unit in the same ea
	bys itemcode unit ea `admin3' `admin2' `admin1' `admin0' urban: egen n_`u'_adm4=count(price_purchase!=.) if unit==`u'
	tab n_`u'_adm4 
	replace price_`u'_med =. if n_`u'_adm4<3
	
	* if ea level is missing, replace with ward level prices
	bys itemcode unit `admin3' `admin2' `admin1' `admin0' urban: egen n_`u'_adm3=count(price_purchase!=.) if unit==`u'
	tab n_`u'_adm3 
	replace price_`u'_med = p_purchase_adm3_`u' if price_`u'_med == . & n_`u'_adm3 >= 3 

	* if ea and ward level are missing, replace with district level prices
	bys itemcode unit `admin2' `admin1' `admin0' urban: egen n_`u'_adm2=count(price_purchase!=.) if unit==`u'
	tab n_`u'_adm2 
	replace price_`u'_med = p_purchase_adm2_`u' if price_`u'_med == . & n_`u'_adm2 >= 3 

	* if ea, ward and district level are missing, replace with region level prices
	bys itemcode unit `admin1' `admin0' urban: egen n_`u'_adm1=count(price_purchase!=.) if unit==`u'
	tab n_`u'_adm1 
	replace price_`u'_med = p_purchase_adm1_`u' if price_`u'_med == . & n_`u'_adm1 >= 3 

	* if ea, ward, district and region level are missing, replace with macro-region level prices
	bys itemcode unit `admin0' urban: egen n_`u'_adm0=count(price_purchase!=.) if unit==`u'
	tab n_`u'_adm0 
	replace price_`u'_med = p_purchase_adm0_`u' if price_`u'_med == . & n_`u'_adm0 >= 3 

	* if ea, ward, district, region and macro-region level are missing, replace with urban/rural level prices
	bys itemcode unit urban: egen n_`u'_urbrur=count(price_purchase!=.) if unit==`u'
	tab n_`u'_urbrur 
	replace price_`u'_med = p_purchase_urbrur_`u' if price_`u'_med == . & n_`u'_urbrur >= 3 

	* if all else missing, replace with unit prices
	bys itemcode unit: egen n_`u'_unit=count(price_purchase!=.) if unit==`u'
	tab n_`u'_unit 
	replace price_`u'_med = p_purchase_unit_`u' if price_`u'_med == . & n_`u'_unit >= 3 
	}

foreach u in 1 3 5 {
	count if price_`u'_med == . & unit==`u'
	** 1 for kg; 3 for liter; 5 for pieces;
	drop if price_`u'_med == . & unit==`u'
	
	winsor2 price_`u'_med if unit==`u', suffix(imp) by(itemcode) cuts(5 95)
	replace price_`u'_med=price_`u'_medimp
	}

drop p_purchase_adm4_1 - p_purchase_unit_5 *medimp
drop n_1_adm4 - n_5_unit

tempfile consumed_consumption
save `consumed_consumption', replace


** Now save price dataset

collapse (median) price_1_med price_3_med price_5_med, by(`admin0' `admin1' `admin2' `admin3' `ea' itemcode)

tempfile prices_purchased
save `prices_purchased', replace


** Convert to most common unit

use `consumed_consumption', clear

count if unit!=1 & unit!=3 //  562 obs with non-standard unit  

* Identify the most common unit for each item. In the final dataset we will convert to kg, for now liters and pieces are still OK
gen kg=1 if unit==1
gen lit=1 if unit==3
gen pie=1 if unit==5
collapse (sum) nkg=kg nlit=lit npie=pie, by(itemcode) // count the number of units of each item

*compare the number of units within item and find the the most common one:
gen common_unit=.
replace common_unit=1 if nkg>=nlit & nkg>=npie
replace common_unit=3 if nlit>nkg & nlit>npie
replace common_unit=5 if npie>nkg & npie>nlit

tab common_unit
count if common_unit==.

label var common_unit "most common unit per item: 1=kg, 3=liter, 5=pieces"
sort common_unit
by common_unit: list itemcode nkg nlit npie // check whehter we find the most common unit correctly 

drop nkg nlit npie 

tempfile commonunit_w1
save `commonunit_w1', replace

* Now convert to most common unit
use  `consumed_consumption', clear
merge m:1 itemcode using `commonunit_w1'
drop _m

* Convert unit of total consumption to the most common unit for each item  *
gen nonst = (unit!=1 & unit!=3) // 575 obs

count if nonst==1 & common_unit==1 & price_1_med==. /*120*/
count if nonst==1 & common_unit==3 & price_3_med==. /*0*/
count if nonst==1 & common_unit==5 & price_5_med==. /*0*/

* Use the median price to get an approximate value of total consumption. When consumption reported in non-standard units, we convert using the median price at the EA or higher level. We use this price to get the value of total consumption if there is a non-standard unit of consumption.
g spend_temp = q_total * price_1_med if unit==1
replace spend_temp = q_total * price_3_med if unit==3
replace spend_temp = q_total * price_5_med if unit==5


* For items whose most common unit is kg, covert all units to kg [and for liters, pieces as well]
levelsof itemcode, local(items)
foreach item in `items' {
	foreach u in 1 3 5 {
		replace q_total = spend_temp / price_`u'_med if itemcode==`item' & unit!=`u' & common_unit==`u'
		replace q_purchase = spend_temp / price_`u'_med if itemcode==`item' & unit!=`u' & common_unit==`u'
		replace unit=`u' if itemcode==`item' & common_unit==`u'
		}
	}

tempfile consumed_consumption_mid2_t
save `consumed_consumption_mid2_t', replace


** exclude outliers of quantity consumed per capita (pq_total) and of quantity purchased (pq_purchase) and of price, using distribution 

merge m:1 `hh' using "${IN}\TZY1.HH.Consumption.dta", nogen keepusing(adulteq hhsize)

g price_med = price_1_med if common_unit==1
replace price_med = price_3_med if common_unit==3
replace price_med = price_5_med if common_unit==5

foreach j in total purchase { 
	g qpc_`j' = q_`j' / adulteq
	}

levelsof itemcode, local(items)
foreach item in `items' {
	tab itemcode if itemcode==`item'
	foreach j in total purchase {	
		su qpc_`j' if itemcode==`item' & qpc_`j'>0 & qpc_`j'!=., detail /*excludes zero from the lower bound*/
		replace qpc_`j' = r(p95) if itemcode==`item' & qpc_`j'>0 & qpc_`j'!=. & qpc_`j'>r(p95)
		replace qpc_`j' = r(p5) if itemcode==`item' & qpc_`j'>0 & qpc_`j'!=. & qpc_`j'<r(p5)
		}
	su price_purchase if itemcode==`item' & q_purchase>0 & q_purchase!=., detail 
	replace price_purchase = r(p95) if itemcode==`item' & q_purchase>0 & q_purchase!=. & price_purchase>r(p95)
	replace price_purchase = r(p5) if itemcode==`item' & q_purchase>0 & q_purchase!=. & price_purchase<r(p5)
	}

replace spend=q_total*price_purchase if price_purchase != .
replace spend=q_total*price_med if price_purchase==. & spend==. & q_total > 0 & q_total!=.

duplicates tag `hh' itemcode, gen(dup)
tab dup // no duplicates 

drop price_1_med price_3_med price_5_med

tempfile consumed_consumption_mid2 
save `consumed_consumption_mid2', replace 


**  Now restore obverations for non-consumers   

use `sample', clear
merge 1:m hhid using `consumed_consumption_mid2'

tab _merge
drop if _merge!=3 //none dropped 
drop _merge 

count if unit==common_unit /*this is true for all obs*/
count


*restore obs for households who did not consume (assume missing means non-consumption)
*drop if itemcode==.
duplicates drop // 0 obs

rename spend foodcons

keep `hh' `admin0' `admin1' `admin2' `admin3' `ea' itemcode  eat q_total q_purchase foodcons unit foodcons price_purchase price_med
destring unit, replace 

*We reshape the data to wide so that we can imput q_total=0, foodcons=0 for each item

drop if itemcode==. | unit==.

levelsof itemcode, local(items)

reshape wide eat q_total q_purchase unit foodcons price_purchase price_med , i(hhid) j(itemcode)

foreach i in `items' {
	replace eat`i'=0              if eat`i'==.
	replace q_total`i'=0          if q_total`i'==.
	replace foodcons`i'=0         if foodcons`i'==.
	replace q_purchase`i'=0       if q_purchase`i'==.
	}
	
*Reshape data back to long
reshape long eat q_total q_purchase unit foodcons price_purchase price_med, i(hhid) j(itemcode)
drop if q_total==.

codebook itemcode // 42 items 
tab itemcode

g foodgroup = .
label define foodgroups 1 staples 2 pulsenutfruitveg 3 animal 4 other
label values foodgroup foodgroups
replace foodgroup = 1 if inlist(itemcode,101,102,103,104,105,106,107,108,109,110,111,112,201,202,203,204,205,206,207)
replace foodgroup = 2 if inlist(itemcode,401,501,503,504,502,601,602,603,701,702,703)
replace foodgroup = 3 if inlist(itemcode,801,802,803,804,805,806,807,808,809,810,901,902,903)
replace foodgroup = 4 if inlist(itemcode,301,302,303,704,1001,1002,1003,1004,1101,1102,1103,1104,1105,1106,1107,1108)

tempfile foodcons_item_wk
save `foodcons_item_wk', replace


** merge with prices

merge m:1 `admin0' `admin1' `admin2' `admin3' `ea' itemcode using `prices_purchased', keepusing(price_1_med price_3_med price_5_med)
tab _m
drop _m

foreach u in 1 3 5 {
	replace price_med = price_`u'_med if price_med==. & price_`u'_med!=. & unit==`u'
	}

replace foodcons = price_med * q_total if q_total > 0 & price_med==.
drop if inlist(itemcode,503,504,805,806,810,903,1103,1105,1108) /*These items are too rarely consumed*/


** Generate price unit values now

duplicates report `hh' itemcode // no duplicates  

keep `hh' itemcode foodgroup eat unit q_total q_purchase foodcons price_purchase price_med 

tempfile foodcons_item_wk
save `foodcons_item_wk', replace


save "${TEMP}/Foodcons_item_wk_W1.dta", replace

collapse (sum) foodcons, by(foodgroup `hh')

reshape wide foodcons, i(`hh') j(foodgroup)

su foodcons*, detail

foreach var in foodcons1 foodcons2 foodcons3 foodcons4 {
	replace `var' = `var'*52 /*annualized*/
	winsor2 `var', suffix(imp) cuts(0 99) 
	replace `var' = `var'imp 
	}

egen foodcons = rowtotal(foodcons1 foodcons2 foodcons3 foodcons4)

tempfile foodcons_hh
save `foodcons_hh', replace


*****************************
**** Expenditures GOODS  ****
*****************************

** Goods consumption -- includes food (section K), non-food one-week recall items (section L), non-food annual items (section M)

* Non-food goods consumption - from sections L and M (some are goods some are services)
* Section L is non-food one week 
	* --> Goods include cigarettes, matches, kerosene, gas, petrol, soap, toiletries, personal products, cleaning products, bulbs, 
* Section M is non-food 12 months 
	* --> Goods include carpets, linens, mats, mosquito nets, mattresses, sports equipment, film

import excel using "${LOCDRIVE}\Parameters\GoodServiceCodes.xlsx", clear firstrow sheet("Tanzania_R1")

tempfile goodservicecodes
save `goodservicecodes', replace

use "${IN}\SEC_L.dta", clear
rename slcode itemcode
rename slq1 purchase
rename slq2 exp

*drop if purchase==9 | purchase==. /*supposedly missing y/n on whether purchased - but we have exp data*/
replace purchase = 1 if purchase==2 & exp >0 & exp != .
replace purchase = 1 if purchase==. & exp >0 & exp != .
replace purchase = 1 if purchase==9 & exp >0 & exp != . & exp != 99

drop if purchase==2 

keep `hh' itemcode purchase exp
sort `hh'

tempfile 	 sectionL
save 		`sectionL', replace

save "${TEMP}\SectionL_raw_R1", replace


use "${IN}\SEC_M.dta", clear
rename smcode itemcode
rename smq1 purchase
rename smq2 exp
replace exp = smq3 if (itemcode == 318 | itemcode == 319) & (exp==0 | exp==.) & (smq3 != . | smq3 != 0) /*bamboo and grass - exp reports value consumed and smq3 reports value purchased. because both refer to the full 12 months I think we should use value consumed primarily.*/ 

drop if (purchase == 2) & (exp == . | exp == 0)

keep `hh' itemcode purchase exp
sort `hh'

append using `sectionL'

sort itemcode
merge m:1 itemcode using `goodservicecodes', nogen keep(1 3) keepusing(recall good service other good_* service_*)

winsor2 exp, suffix(imp) cuts(0 99) by(itemcode) /*winsorize top 1 percent*/

foreach type in service good other {
	g exp_`type' = expimp * `type' * (12/recall) /*recall is the number of months per recall period*/
	}

loc goodservicecollapselist ""
foreach type in `gooddisagglist' `servicedisagglist' {
	g exp_`type' = expimp * `type' * (12/recall) /*recall is the number of months per recall period*/
	loc goodservicecollapselist `goodservicecollapselist' exp_`type'
	}


**Export item level data for descriptive analysis later
sort `hh' itemcode
save "${TEMP}\Tanzania_GoodService_Items_R1.dta", replace

collapse (sum) exp_good exp_service exp_other `goodservicecollapselist', by(`hh')

sort `hh'
merge 1:1 `hh' using `sample', nogen keep(1 3)

winsor2 exp_good exp_service exp_other `goodservicecollapselist' , suffix(imp) cuts(0 99) by(`admin1')  /*winsorize top 1 percent by region*/

foreach g in good service other `gooddisagglist' `servicedisagglist' {
	rename exp_`g'imp exp_`g'_secLM
	drop exp_`g'
	}

sort `hh'
tempfile goodsandservices
save `goodsandservices', replace


********************************
**** Expenditures SERVICES  ****
********************************

** Services consumption
* Section F is food outside hh - 7 day value of expenditures by member on prepared food and bar and local brews --> YES
* Section J is housing --> rent? housing materials? use of water vender? --> YES
* Section C is education
* Section D is health, Section G is children elsewhere, section O is assistance and groups, section P is credit, section Q is crime/justice, section R is shocks, section S is deaths
* for Sections L and M see exp_service_secLM from above
	* --> Section L services (non-food one week recall) see GoodServiceCodes to see classification 
	* --> Section M services (non-food 12 month recall) see GoodServiceCodes to see classification 

* Food outside of house consumption (Section F)

use "${IN}\SEC_B_C_D_E1_F_G1_U.dta", clear
keep `hh' `indiv' sbq9 sf*

sort `hh'
merge m:1 `hh' using `sample', nogen keep(1 3) keepusing(`admin1')

drop if sfq1==2 | sfq1==. /*no data are lost here*/

*3: 3 exp obs w 2 not coded to 1; 5: none; 7: none; 9: none; 11: one as no; 13: one as no; 15: one as no
foreach k in 3 5 7 9 11 13 15 {
	loc j = `k' - 1
	bysort sfq`j': su sfq`k', detail
	replace sfq`k' = 0 if (sfq`k' == 99 | sfq`k' == 9) & sfq`j' != 1  /*lots of "99" and "9" values in there*/
	}
egen exp_service_food = rowtotal(sfq3 sfq5 sfq7 sfq9 sfq11 sfq13 sfq15), missing

winsor2 exp_service_food, suffix(imp) cuts(0 99) by(`admin1')
replace exp_service_food = exp_service_foodimp * 52 /*recall period is one week*/

collapse (sum) exp_service_food, by(`admin1' `hh')

winsor2 exp_service_food, suffix(imp) cuts(0 99) by(`admin1')
replace exp_service_food = exp_service_foodimp

sort `hh' 
tempfile 	 services_food
save 		`services_food', replace


* Rent and housing (Section J)

use "${IN}\SEC_H1_J_K2_O2_P1_Q1_S1.dta", clear

keep `hh' sjq*
sort `hh'
merge 1:1 `hh' using `sample', nogen keep(1 3) keepusing(`admin1')

rename sjq2 exp_rent
recode exp_rent (.=0)
replace exp_rent = exp_rent * 12 /*annualized*/

winsor2 exp_rent, suffix(imp) cuts(0 99) by(`admin1')
replace exp_rent = exp_rentimp

rename exp_rent exp_service_rent

keep `hh' exp_service_rent
sort `hh'

tempfile	 services_rent
save		`services_rent', replace


* Education (Section C)

use "${IN}\SEC_B_C_D_E1_F_G1_U.dta", clear

keep `hh' `indiv' scq14_* /*these are broken out but the tot variable is the sum of the components in all cases*/
sort `hh'
merge m:1 `hh' using `sample', nogen keep(1 3) keepusing(`admin1')

egen exp_good_educ = rowtotal(scq14_bks scq14_unif), missing /*books and uniform*/
egen exp_service_educ = rowtotal(scq14_fee scq14_trans scq14_tui scq14_contr scq14_food), missing /*fees, transportation, tuition, other, food */

winsor2 exp_service_educ exp_good_educ, suffix(imp) cuts(0 99) by(`admin1')
replace exp_service_educ = exp_service_educimp
replace exp_good_educ = exp_good_educimp

recode exp_service_educ exp_good_educ (.=0)
collapse (sum) exp_service_educ exp_good_educ, by(`hh')

keep `hh' exp_service_educ exp_good_educ
sort `hh'

tempfile exp_educ
save `exp_educ', replace


* Health (Section D)
use "${IN}\SEC_B_C_D_E1_F_G1_U.dta", clear

keep `hh' `indiv' sdq* /*these are broken out but the tot variable is the sum of the components in all cases*/
sort `hh'
merge m:1 `hh' using `sample', nogen keep(1 3) keepusing(`admin1')

egen service_health_4w = rowtotal(sdq5 sdq6), missing /*monthly*/
egen service_health_hos = rowtotal(sdq9 sdq11), missing /*annual hospital and healer expenses*/
g exp_good_health = sdq7 /*monthly*/

winsor2 service_health_4w service_health_hos exp_good_health, suffix(imp) cuts(0 99) by(`admin1')
foreach var in service_health_4w service_health_hos exp_good_health {
	replace `var' = `var'imp
	recode `var' (.=0)
	}

replace exp_good_health = exp_good_health * (52/4)
replace service_health_4w = service_health_4w * (52/4)

egen exp_service_health = rowtotal(service_health_4w service_health_hos)

collapse (sum) exp_service_health exp_good_health, by(`hh')

keep `hh' exp_service_health exp_good_health
sort `hh'

tempfile exp_health
save `exp_health', replace


*Children elsewhere (Section G) 

use "${IN}\SEC_G2.dta", clear

rename sgq11 exp_service_remit

winsor2 exp_service_remit, cuts(0 99) suffix(imp)
replace exp_service_remit = exp_service_remitimp

keep `hh' exp_service_remit

collapse (sum) exp_service_remit, by(`hh')

tempfile exp_remit
save `exp_remit', replace


*********************
**** Prices FOOD ****
*********************

** Use mean in base period and district as base #fixfixfix should I use the sample-wide mean instead?
*https://www.stata.com/statalist/archive/2003-01/msg00076.html
/*items dropped from HDS: paddy rice 101  green cob maize 103 cashew almonds other nuts 503 canned dried and wild vegetables 603 package fish 810 prepared tea coffee 1105 */

use `foodcons_item_wk', clear
g unitv = foodcons/q_total
merge m:1 hhid using `sample', nogen keepusing(urban `admin0' `admin1' `admin2' `admin3' `weight')

tempfile foodcons_item_sample
save `foodcons_item_sample', replace


* use macro region mean where few consume the item even in base region 
use `foodcons_item_sample', clear
keep if `admin0'==`admin0_base' & itemcode != .
collapse (mean) x0=q_total p0=unitv (sum) eat [aw=`weight'], by(itemcode)
rename x0 x0_adm0 
rename p0 p0_adm0 

tab eat
replace p0_adm0=. if eat < 3
count if p0_adm0==.

tempfile base_qty_price_adm0
save `base_qty_price_adm0', replace
save "${TEMP}\Base_qty_price_adm0.dta", replace


** Now generate the Fisher price indexes

use `foodcons_item_sample', clear
merge m:1 itemcode using `base_qty_price_adm0', nogen keepusing(x0_adm0 p0_adm0)

g laspeyeres_num = price_med*x0_adm0
g laspeyeres_denom = p0_adm0*x0_adm0
g paasche_num = price_med*q_total
g paasche_denom = p0_adm0*q_total 

collapse (sum) laspeyeres_num laspeyeres_denom paasche_num paasche_denom eat, by(`hh')

g fisher = sqrt((laspeyeres_num/laspeyeres_denom)*(paasche_num/paasche_denom))

winsor2 fisher, suffix(imp) cuts(1 99)
replace fisher=fisherimp

rename fisher pr_foodtot

g lpr_food = log(pr_foodtot)


** Save dataset

keep `hh' lpr_food pr_foodtot
sort `hh'  

tempfile 	 prices_foodtot
save 		`prices_foodtot', replace

use `sample', clear
merge 1:1 `hh' using `prices_foodtot',  nogen keep(1 3)

tempfile 	 prices_foodtot
save 		`prices_foodtot', replace


********************************
**** Total Expenditures ****
********************************/

use "${IN}\TZY1.HH.Consumption.dta", clear
keep `hh' expm foodIN foodOUT utilities hhexpenses health transport communic recreat educa other adulteq hhsize 

rename hhsize hhsize_adeq

*egen exp_sagg_serv = rowtotal(foodOUT utilities hhexpenses health transport communic recreat educa)
*egen exp_sagg_good = rowtotal(foodIN)
*egen exp_sagg_other = rowtotal(other)

g pcexp = expm / hhsize /*annual nominal per capita hh expenditures*/
g pcexp_adeq = expm / hhsize_adeq
rename expm hhexp

winsor2 hhexp pcexp pcexp_adeq, suffix(imp) cuts(0 99)
replace pcexp=pcexpimp
replace pcexp_adeq=pcexp_adeqimp
replace hhexp=hhexpimp 

keep `hh' hhexp pcexp pcexp_adeq hhsize hhsize_adeq /*exp_sagg**/
sort `hh'

tempfile	 expenditures
save		`expenditures', replace
*


********************
**** Covariates ****
********************


** Education variables

use "${DATADRIVE}\OtherRaw\TimeuseAll_indiv_${YEAR}.dta" /*there are 45 hhs missing a head, of those 36 have a spouse, 6 have "son/daughter" one has "grandchild" and 1 has "parent" */

g educ_yrs_head = educ_yrs if relationtohead==1
g female_head = female if relationtohead==1
g age_head = age if relationtohead==1
g educ_yrs_spouse = educ_yrs if relationtohead==2 
g age_spouse = age if relationtohead==2

sort `hh' relationtohead
collapse (firstnm) female_head age_head educ_yrs_head (max) educ_yrs_max=educ_yrs educ_yrs_spouse age_spouse, by(`hh')
count if female_head==. 
count if age_head==.
count if educ_yrs_head==.
replace educ_yrs_head = educ_yrs_spouse if educ_yrs_head==.
replace age_head = age_spouse if age_head==.

sort `hh'
keep `hh' educ_yrs_head educ_yrs_max

tempfile educ
save `educ', replace


** Dependency Variables

use "${DATADRIVE}\OtherRaw\TimeuseAll_indiv_${YEAR}.dta", clear
keep `hh' `indiv' age tobedropped

sort `hh' `indiv'
merge 1:1 `hh' `indiv' using "${IN}\SEC_B_C_D_E1_F_G1_U.dta", keepusing(sbq4)
replace age = sbq4 if age==.

preserve

g pop_dep = ( age < 15 | age > 65 ) & age != . & tobedropped==0
g pop_indep = ( age >= 15 & age <= 65 ) & age != . & tobedropped==0

collapse (sum) pop_dep pop_indep, by(`hh')
count if pop_dep==.
count if pop_dep==0 & pop_indep==0
count if pop_indep==.
count if pop_indep==0

g hh_depshare = (pop_dep + 0.1) / (pop_dep + pop_indep + 0.1) /*the 0.1 is to prevent dividing by zero if age missing from all*/
count if hh_depshare==.
	
keep `hh' hh_depshare
tempfile depshare
save `depshare', replace


** Participation Variables 

use "${DATADRIVE}\OtherRaw\TimeuseAll_hh_${YEAR}.dta", clear 

*g p_farm = h_ownfarm > 0 & h_ownfarm != .
g pcmh_farm = p_ownfarm==1 & h_ownfarm > 35 /*"Min hours participation" participation - at least 35 hours per year*/

*g p_ent = h_ownent > 0 & h_ownent != .
g pcmh_ent = p_ownent == 1 & h_ownent > 35

*g p_market = h_wagelbr > 0 & h_wagelbr != .
g pcmh_market = p_wagelbr == 1 & h_wagelbr > 35


keep `hh' pcmh_* 
tempfile pcmh
save `pcmh', replace


** Asset index

use "${IN}\SEC_N.dta", clear
/*
         401 Radio and Radio Cassette - yes
         402 Telephone(landline) - yes
         403 Telephone(mobile) - yes
         404 Refrigerator or freezer - yes
         405 Sewing Machine - yes
         406 Television - yes
         407 Video / DVD - yes
         408 Chairs - yes
         409 Sofas - yes
         410 Tables - yes
         411 Watches - yes
         412 Beds - yes
         413 Cupboards, chest-of-drawers, boxes, wardrobes, bookcases - yes
         414 Lanterns - yes
         415 Computer - yes
         416 Cooking pots, Cups, other kitchen utensils - yes
         417 Mosquito net - yes
         418 Iron (Charcoal or electric) - yes
         419 Electric/gas stove - yes
         420 Other stove - yes
         421 Water-heater - yes
         422 Record/cassette player, tape recorder - yes
         423 Complete music system - yes
         424 Books (not school books) - yes
         425 Motor Vehicles - yes
         426 Motor cycle - yes
         427 Bicycle - yes
         428 Carts
         429 Animal Cart
         430 Boat/canoe
         431 Wheel barrow
         432 Livestock
         433 Poultry
         434 Outboard engine
         435 Donkeys
         436 Fields/Land
         437 House(s) - yes
         438 Air-conditioned - yes
         439 Dish antenna/decoder - yes
         440 Hoes
         441 Spraying machine
         442 Water pumping set
         443 Reapers
         444 Tractor
         445 Trailer for tractors etc.
         446 Plough etc.
         447 Harrow
         448 Milking machine
         449 Harvesting and threshing machine
         450 Hand milling machine
         451 Coffee pulping machine
         452 Fertilizer distributor
*/
*

loc assetlist 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 437 438 439 

g keep=0
foreach asset in `assetlist' {
	replace keep = 1 if sncode==`asset'
	}

keep if keep==1
drop keep

rename snq1 count_
reshape wide count_, i(`hh') j(sncode)

foreach asset in `assetlist' {
	replace count_`asset' = 0 if count_`asset'==.
	}
	
pca count_401 - count_439

predict wealthscore
la var wealthscore "Consumptive asset index"

keep `hh' wealthscore

tempfile wealthscore
save `wealthscore', replace


** Build the covariates dataset

use `sample', clear
sort `hh'
merge 1:1 `hh' using `educ', nogen keep(1 3) keepusing(educ_yrs_head educ_yrs_max)
sort `hh'
merge 1:1 `hh' using `depshare', nogen keep(1 3) keepusing(hh_depshare)
sort `hh'
merge 1:1 `hh' using `pcmh', nogen keep(1 3) keepusing(pcmh_farm pcmh_ent pcmh_market)
sort `hh'
merge 1:1 `hh' using `wealthscore', nogen keep(1 3) keepusing(wealthscore)

tempfile covariates
save `covariates', replace


*********************************
**** Dataset Assembley ****
********************************/

* Put tegether full dataset

use `sample', clear
merge 1:1 `hh' using `foodcons_hh', nogen keep(1 3)
sort `hh'
merge 1:1 `hh' using `goodsandservices', nogen keep(1 3)
sort `hh'
merge 1:1 `hh' using `services_food', nogen keep(1 3)
sort `hh'
merge 1:1 `hh' using `services_rent', nogen keep(1 3)
sort `hh'
merge 1:1 `hh' using `exp_educ', nogen keep(1 3)
sort `hh'
merge 1:1 `hh' using `exp_health', nogen keep(1 3)
sort `hh'
merge 1:1 `hh' using `expenditures', nogen keep(1 3)
sort `hh'
merge 1:1 `hh' using `covariates', nogen keep(1 3)
sort `hh'
merge 1:1 `hh' using `intdate', nogen keep(1 3)
sort `hh'
save "${TEMP}\Tanzania_HH_R1.dta", replace
*

**************
log close
